In [1]:
# パッケージをインストール
%pip install -qe ..
[notice] A new release of pip is available: 24.0 -> 25.1.1
[notice] To update, run: pip install --upgrade pip
Note: you may need to restart the kernel to use updated packages.
In [2]:
import duckdb
import japanize_matplotlib  # noqa: F401
import matplotlib.dates as mdates
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
In [3]:
from sklearn.ensemble import IsolationForest
from sklearn.preprocessing import StandardScaler
In [5]:
 
In [4]:
plt.style.use("ggplot")
In [ ]:
 
In [5]:
with duckdb.connect("../data/raw/etl_from_sf.duckdb", read_only=True) as con:
    df = con.execute("SELECT * FROM raw_clean_with_pool").df()
df.head()
Out[5]:
hour_ts id volume_usd tvl_usd liquidity volume_token0 volume_token1 fees_usd open_price high_price ... load_ts fee_tier token0_id token0_symbol token0_name token0_decimals token1_id token1_symbol token1_name token1_decimals
0 1746784800 0xac8ba9d6fa2f9ff7c712bef5670cb0f7e4177965-485218 698.000000 2.711293e+05 3.112152e+17 6346.977788 698.000000 2.094000 9.059070 9.059070 ... 2025-05-09 11:00:09.148000+00:00 3000 0x2d8ea194902bc55431420bd26be92b0782dce91d ZND ZNDToken 18 0xdac17f958d2ee523a2206206994597c13d831ec7 USDT Tether USD 6
1 1745672400 0xfa6e8e97ececdc36302eca534f63439b1e79487b-484909 302.294707 5.368977e+04 2.713280e+13 302.346914 302.242500 0.030229 1.000449 1.000449 ... 2025-05-09 11:00:22.403000+00:00 100 0xa0b86991c6218b36c1d19d4a2e9eb0ce3606eb48 USDC USD Coin 6 0xdac17f958d2ee523a2206206994597c13d831ec7 USDT Tether USD 6
2 1745672400 0x7ce04536279a74951c1a0148b5488d8a027a9bf8-484909 6.605206 2.190198e+02 4.443265e+19 82.062242 0.003694 0.066052 22165.049401 22165.049401 ... 2025-05-09 11:00:22.403000+00:00 10000 0x419c4db4b9e25d6db2ad9691ccb832c8d9fda05e DRGN Dragon 18 0xc02aaa39b223fe8d0a0e5c4f27ead9083c756cc2 WETH Wrapped Ether 18
3 1744063200 0x0598690ad1daffc3ac47fd6adcaeac547c067e26-484462 0.000000 3.658733e+06 7.049047e+26 0.000000 0.000000 0.000000 1.007475 1.007475 ... 2025-05-09 11:51:11.735000+00:00 3000 0x57e114b691db790c35207b2e685d4a43181e6061 ENA ENA 18 0x8be3460a480c80728a8c4d7a5d5303c85ba7b3b9 sENA Staked ENA 18
4 1744063200 0x6288694eb218614a27777f2b52d3f8d4819233c0-484462 1697.111986 3.920453e+03 6.054509e+20 79772.446480 1.080806 16.971120 78952.973584 81377.447456 ... 2025-05-09 11:51:11.735000+00:00 10000 0xb1d1eae60eea9525032a6dcb4c1ce336a1de71be DRV Derive 18 0xc02aaa39b223fe8d0a0e5c4f27ead9083c756cc2 WETH Wrapped Ether 18

5 rows × 28 columns

In [6]:
raw_clean_with_pool = df
In [7]:
# build_raw_clean_table("../data/raw", "../data/raw/etl_raw_2.duckdb")
In [8]:
# con = duckdb.connect(database="../data/raw/etl_raw_2.duckdb")
# raw_clean_with_pool = con.execute("SELECT * FROM raw_clean_with_pool").df()
# raw_clean_with_pool.head()

データ変換¶

In [9]:
# タイムスタンプを日時に変換
raw_clean_with_pool["datetime"] = pd.to_datetime(raw_clean_with_pool["hour_ts"], unit="s")
In [10]:
# IDからプールアドレスとインデックスを抽出
raw_clean_with_pool["pool_address"] = raw_clean_with_pool["id"].str.split("-").str[0]
raw_clean_with_pool["block_index"] = raw_clean_with_pool["id"].str.split("-").str[1]
In [11]:
# 重複確認と除去
print(f"重複行数: {raw_clean_with_pool.duplicated().sum()}")
raw_clean_with_pool = raw_clean_with_pool.drop_duplicates()
print(f"重複除去後の行数: {raw_clean_with_pool.shape[0]}")
重複行数: 0
重複除去後の行数: 134736
In [12]:
# フィーティアごとのプール数
fee_tier_counts = raw_clean_with_pool["fee_tier"].value_counts().reset_index()
fee_tier_counts.columns = ["fee_tier", "count"]
In [13]:
# 基本統計量の確認
raw_clean_with_pool.describe()
Out[13]:
hour_ts volume_usd tvl_usd liquidity volume_token0 volume_token1 fees_usd open_price high_price low_price close_price tx_count tick sqrt_price token0_price token1_price datetime
count 1.347360e+05 1.347360e+05 1.347360e+05 1.347360e+05 1.347360e+05 1.347360e+05 134736.000000 1.347360e+05 1.347360e+05 1.347360e+05 1.347360e+05 134736.000000 134734.000000 1.347360e+05 1.347360e+05 1.347360e+05 134736
mean 1.743539e+09 5.510828e+04 4.231219e+06 2.937187e+30 1.114310e+16 1.325911e+11 56.812143 2.525362e+51 2.525367e+51 2.525362e+51 2.525367e+51 7.497966 -58742.235197 3.037088e+44 2.525367e+51 2.553138e+45 2025-04-01 20:28:27.766298624
min 1.741126e+09 0.000000e+00 -1.122805e+05 0.000000e+00 0.000000e+00 0.000000e+00 0.000000 0.000000e+00 0.000000e+00 0.000000e+00 0.000000e+00 1.000000 -887272.000000 0.000000e+00 0.000000e+00 0.000000e+00 2025-03-04 22:00:00
25% 1.741748e+09 2.596662e+01 2.170830e+04 3.601585e+17 8.994876e+00 1.078082e-01 0.087213 4.790029e-01 4.972653e-01 4.747034e-01 4.940731e-01 1.000000 -139830.250000 7.285223e+25 4.940731e-01 3.572542e-05 2025-03-12 03:00:00
50% 1.743001e+09 2.410914e+02 2.090232e+05 1.181802e+21 9.500936e+02 4.329109e+00 0.983544 1.169327e+02 1.196488e+02 1.153471e+02 1.183872e+02 2.000000 -79293.500000 1.503591e+27 1.183872e+02 8.182775e-03 2025-03-26 15:00:00
75% 1.745255e+09 1.910617e+03 1.180628e+06 3.255444e+22 3.629037e+04 1.129278e+03 7.956426 2.705184e+04 2.768699e+04 2.674672e+04 2.742095e+04 5.000000 34735.250000 4.496508e+29 2.742095e+04 1.934087e+00 2025-04-21 17:00:00
max 1.746853e+09 1.432742e+08 4.263057e+08 1.833394e+34 1.277522e+21 7.598498e+15 49084.157146 3.402568e+56 3.402568e+56 3.402568e+56 3.402568e+56 1065.000000 887271.000000 1.461447e+48 3.402568e+56 3.402568e+50 2025-05-10 05:00:00
std 1.844010e+06 8.091992e+05 2.493916e+07 2.302294e+32 3.490701e+18 2.538906e+13 578.628932 9.269685e+53 9.269685e+53 9.269685e+53 9.269685e+53 32.755489 158515.483019 2.106574e+46 9.269685e+53 9.270152e+47 NaN
In [14]:
# 数値カラムのキャスト
num_cols = [
    "volume_usd",
    "tvl_usd",
    "fees_usd",
    "open_price",
    "high_price",
    "low_price",
    "close_price",
    "liquidity",
    "volume_token0",
    "volume_token1",
    "tx_count",
    "tick",
    "sqrt_price",
]
for c in num_cols:
    raw_clean_with_pool[c] = pd.to_numeric(raw_clean_with_pool[c], errors="coerce")

異常値の検出 & クリーニング¶

In [15]:
#  clip で下限 0、 or 上限 percentile 99.9% に制限
raw_clean_with_pool["tvl_usd"] = raw_clean_with_pool["tvl_usd"].clip(lower=0)
upper = raw_clean_with_pool["volume_usd"].quantile(0.999)
raw_clean_with_pool = raw_clean_with_pool[raw_clean_with_pool["volume_usd"] <= upper]
In [16]:
# 欠損の確認(型変換で NaN が入る可能性があるため)
print(raw_clean_with_pool[num_cols].isna().sum())
raw_clean_with_pool = raw_clean_with_pool.dropna(subset=num_cols)
volume_usd       0
tvl_usd          0
fees_usd         0
open_price       0
high_price       0
low_price        0
close_price      0
liquidity        0
volume_token0    0
volume_token1    0
tx_count         0
tick             2
sqrt_price       0
dtype: int64

可視化¶

時系列分析¶

In [17]:
# 時系列での取引量推移(全体)
daily_volume = (
    raw_clean_with_pool.groupby(raw_clean_with_pool["datetime"].dt.date).agg({"volume_usd": "sum"}).reset_index()
)

plt.figure(figsize=(12, 6))
sns.lineplot(data=daily_volume, x="datetime", y="volume_usd", marker="o", markersize=4, linewidth=1.5)
plt.title("日次総取引量")
plt.xlabel("日付")
plt.ylabel("取引量 (USD)")
plt.gca().xaxis.set_major_formatter(mdates.DateFormatter("%Y-%m-%d"))
plt.xticks(rotation=45)
plt.tight_layout()
plt.show()

# 主要プールの時系列分析
top_pools = raw_clean_with_pool.groupby("pool_address")["volume_usd"].sum().nlargest(5).index

for pool in top_pools:
    dfp = raw_clean_with_pool[raw_clean_with_pool["pool_address"] == pool].copy()

    # 日付で集計
    dfp["date"] = dfp["datetime"].dt.date
    dfp_agg = dfp.groupby("date")[["volume_usd", "tvl_usd"]].sum().reset_index()

    # 二軸プロット作成
    fig, ax1 = plt.subplots(figsize=(12, 6))

    # 左軸:取引量
    color1 = "tab:blue"
    line1 = sns.lineplot(
        data=dfp_agg, x="date", y="volume_usd", marker="o", markersize=4, linewidth=1.5, color=color1, ax=ax1
    )
    ax1.set_xlabel("日付")
    ax1.set_ylabel("取引量 (USD)", color=color1)
    ax1.tick_params(axis="y", labelcolor=color1)

    # 右軸:TVL
    ax2 = ax1.twinx()
    color2 = "tab:red"
    line2 = sns.lineplot(
        data=dfp_agg, x="date", y="tvl_usd", marker="o", markersize=4, linewidth=1.5, color=color2, ax=ax2
    )
    ax2.set_ylabel("TVL (USD)", color=color2)
    ax2.tick_params(axis="y", labelcolor=color2)

    # グラフタイトルとフォーマット設定
    plt.title(f"プール {pool[:10]}… の日次推移")
    ax1.xaxis.set_major_formatter(mdates.DateFormatter("%Y-%m-%d"))
    plt.xticks(rotation=45)

    # 凡例追加
    lines1, labels1 = ax1.get_legend_handles_labels()
    lines2, labels2 = ax2.get_legend_handles_labels()
    ax1.legend(lines1 + lines2, ["日次取引量", "日次TVL"], loc="upper left")

    # プロット調整と表示
    sns.despine(left=False, right=False)
    plt.tight_layout()
    plt.show()

# オプション:すべてのトッププールを1つのグラフで比較
plt.figure(figsize=(14, 8))
pool_data = []

for pool in top_pools:
    dfp = raw_clean_with_pool[raw_clean_with_pool["pool_address"] == pool].copy()
    dfp["date"] = dfp["datetime"].dt.date
    dfp_agg = dfp.groupby("date")["volume_usd"].sum().reset_index()
    dfp_agg["pool"] = pool[:8] + "..."  # アドレスの先頭8文字
    pool_data.append(dfp_agg)

all_pools_data = pd.concat(pool_data)

sns.lineplot(data=all_pools_data, x="date", y="volume_usd", hue="pool", marker="o", markersize=4)
plt.title("トッププール5件の日次取引量比較")
plt.xlabel("日付")
plt.ylabel("取引量 (USD)")
plt.gca().xaxis.set_major_formatter(mdates.DateFormatter("%Y-%m-%d"))
plt.xticks(rotation=45)
plt.legend(title="Pool")
plt.tight_layout()
plt.show()
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
In [18]:
# 主要プールの時系列分析
for pool in top_pools:
    dfp = raw_clean_with_pool[raw_clean_with_pool["pool_address"] == pool].copy()

    # datetime を index にして日次リサンプリング
    dfp = dfp.set_index("datetime").sort_index()
    dfp_agg = dfp[["volume_usd", "tvl_usd"]].resample("D").sum().reset_index()

    # 二軸プロット作成
    fig, ax1 = plt.subplots(figsize=(12, 6))

    # 左軸:取引量
    color1 = "tab:blue"
    line1 = sns.lineplot(
        data=dfp_agg, x="datetime", y="volume_usd", marker="o", markersize=4, linewidth=1.5, color=color1, ax=ax1
    )
    ax1.set_xlabel("日付")
    ax1.set_ylabel("取引量 (USD)", color=color1)
    ax1.tick_params(axis="y", labelcolor=color1)

    # 右軸:TVL
    ax2 = ax1.twinx()
    color2 = "tab:red"
    line2 = sns.lineplot(
        data=dfp_agg, x="datetime", y="tvl_usd", marker="o", markersize=4, linewidth=1.5, color=color2, ax=ax2
    )
    ax2.set_ylabel("TVL (USD)", color=color2)
    ax2.tick_params(axis="y", labelcolor=color2)

    # グラフタイトルとフォーマット設定
    plt.title(f"プール {pool[:10]}… の日次推移 (resample)")
    ax1.xaxis.set_major_formatter(mdates.DateFormatter("%Y-%m-%d"))
    plt.xticks(rotation=45)

    # 凡例追加
    lines1, labels1 = ax1.get_legend_handles_labels()
    lines2, labels2 = ax2.get_legend_handles_labels()
    ax1.legend(lines1 + lines2, ["日次取引量", "日次TVL"], loc="upper left")

    # プロット調整と表示
    sns.despine(left=False, right=False)
    plt.tight_layout()
    plt.show()

# 追加オプション:すべてのトッププールを1つのグラフで比較(resampleバージョン)
plt.figure(figsize=(14, 8))
pool_data = []

for pool in top_pools:
    dfp = raw_clean_with_pool[raw_clean_with_pool["pool_address"] == pool].copy()
    # resampleを使用
    dfp = dfp.set_index("datetime").sort_index()
    dfp_agg = dfp[["volume_usd"]].resample("D").sum().reset_index()
    dfp_agg["pool"] = pool[:8] + "..."  # アドレスの先頭8文字
    pool_data.append(dfp_agg)

all_pools_data = pd.concat(pool_data)

sns.lineplot(data=all_pools_data, x="datetime", y="volume_usd", hue="pool", marker="o", markersize=4)
plt.title("トッププール5件の日次取引量比較 (resample)")
plt.xlabel("日付")
plt.ylabel("取引量 (USD)")
plt.gca().xaxis.set_major_formatter(mdates.DateFormatter("%Y-%m-%d"))
plt.xticks(rotation=45)
plt.legend(title="Pool")
plt.tight_layout()
plt.show()
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
In [ ]:
 

特徴量エンジニアリング¶

In [20]:
from matplotlib.cm import ScalarMappable
from matplotlib.colors import Normalize


# プールごとに時系列特徴量を作成(この部分は同じ)
def add_time_features(group):
    # 時系列順にソート
    group = group.sort_values("datetime")

    # 移動平均(24時間)
    group["volume_ma24"] = group["volume_usd"].rolling(24).mean()

    # 移動平均からの乖離率(異常検知の重要指標)
    group["volume_deviation"] = (group["volume_usd"] / (group["volume_ma24"] + 1e-10)) - 1

    # ボラティリティ(24時間の標準偏差)
    group["volume_volatility"] = group["volume_usd"].rolling(24).std()

    # TVLに対する取引量の比率(高すぎると異常の可能性)
    group["volume_tvl_ratio"] = group["volume_usd"] / (group["tvl_usd"] + 1e-10)

    # 価格変動率
    group["price_change"] = (group["close_price"] / group["open_price"]) - 1

    return group


# 各プールに対して特徴を追加
enhanced_df = raw_clean_with_pool.groupby("pool_address").apply(add_time_features)

# 外れ値を制限(可視化のため)
upper_tvl_ratio = np.percentile(enhanced_df["volume_tvl_ratio"].dropna(), 99)
upper_deviation = np.percentile(enhanced_df["volume_deviation"].dropna(), 99)
lower_deviation = np.percentile(enhanced_df["volume_deviation"].dropna(), 1)

plot_df = enhanced_df.copy()
plot_df["volume_tvl_ratio"] = plot_df["volume_tvl_ratio"].clip(upper=upper_tvl_ratio)
plot_df["volume_deviation"] = plot_df["volume_deviation"].clip(upper=upper_deviation, lower=lower_deviation)

# 異常スコアの可視化
# カラーマップの設定
cmap = plt.cm.coolwarm
norm = Normalize(vmin=plot_df["price_change"].quantile(0.05), vmax=plot_df["price_change"].quantile(0.95))

# 散布図プロット
fig, ax = plt.subplots(figsize=(12, 8))
scatter = sns.scatterplot(
    data=plot_df,
    x="volume_tvl_ratio",
    y="volume_deviation",
    hue="price_change",
    palette=cmap,
    hue_norm=norm,
    alpha=0.7,
    edgecolor="none",
    s=50,  # ポイントサイズ
    ax=ax,  # Axesを明示的に指定
)

# カラーバーの追加
sm = ScalarMappable(cmap=cmap, norm=norm)
sm.set_array([])
cbar = plt.colorbar(sm, ax=ax)
cbar.set_label("価格変動率")

# グラフの設定
plt.title("取引量の異常指標", fontsize=16)
plt.xlabel("取引量/TVL比率")
plt.ylabel("移動平均からの乖離率")

# 凡例の非表示(カラーバーがあるため)
scatter.get_legend().remove()

# トゥールチップ代わりにポイントに注釈を付ける場合のサンプル(上位5件の異常値)
anomaly_score = plot_df["volume_deviation"].abs() * plot_df["volume_tvl_ratio"]
top_anomalies = plot_df.loc[anomaly_score.nlargest(5).index]

for idx, row in top_anomalies.iterrows():
    plt.annotate(
        f"Pool: {row['pool_address'][:8]}...\nDate: {row['datetime'].strftime('%Y-%m-%d %H:%M')}",
        xy=(row["volume_tvl_ratio"], row["volume_deviation"]),
        xytext=(10, 10),
        textcoords="offset points",
        bbox=dict(boxstyle="round,pad=0.5", fc="yellow", alpha=0.5),
        arrowprops=dict(arrowstyle="->", connectionstyle="arc3,rad=0"),
    )

plt.tight_layout()
plt.show()

# オプション:異常検知の時系列分析(移動平均からの乖離率の時系列プロット)
# トップ5の異常値を持つプールを選択
top_anomaly_pools = top_anomalies["pool_address"].unique()[:3]  # 上位3つのプールを選択

plt.figure(figsize=(14, 8))
for pool in top_anomaly_pools:
    pool_data = enhanced_df[enhanced_df["pool_address"] == pool].copy()
    pool_data = pool_data.sort_values("datetime")

    sns.lineplot(data=pool_data, x="datetime", y="volume_deviation", label=f"Pool {pool[:8]}...")

plt.axhline(y=0, color="r", linestyle="--", alpha=0.3)
plt.title("移動平均からの乖離率の時系列推移(異常検知上位プール)")
plt.xlabel("日時")
plt.ylabel("移動平均からの乖離率")
plt.legend()
plt.grid(True, alpha=0.3)
plt.tight_layout()
plt.show()
/tmp/ipykernel_72670/1444843618.py:29: DeprecationWarning: DataFrameGroupBy.apply operated on the grouping columns. This behavior is deprecated, and in a future version of pandas the grouping columns will be excluded from the operation. Either pass `include_groups=False` to exclude the groupings or explicitly select the grouping columns after groupby to silence this warning.
  enhanced_df = raw_clean_with_pool.groupby("pool_address").apply(add_time_features)
/usr/local/lib/python3.11/site-packages/matplotlib/colors.py:2295: RuntimeWarning: invalid value encountered in divide
  resdat /= (vmax - vmin)
No description has been provided for this image
No description has been provided for this image

多変量相関分析とプール特性の把握¶

In [21]:
# 1. 相関ヒートマップ
numeric_cols = ["volume_usd", "tvl_usd", "fees_usd", "tx_count", "liquidity", "volume_deviation", "volume_tvl_ratio"]
corr = enhanced_df[numeric_cols].corr()

plt.figure(figsize=(12, 10))
mask = np.triu(np.ones_like(corr, dtype=bool))  # 上三角形をマスク(オプション)
heatmap = sns.heatmap(
    corr,
    annot=True,  # 値を表示
    fmt=".2f",  # 小数点以下2桁表示
    cmap="coolwarm",  # カラーマップ
    vmin=-1,
    vmax=1,  # 値の範囲
    center=0,  # 中央値(カラーマップの中心)
    square=True,  # 正方形のセル
    linewidths=0.5,  # セル間の線の幅
    cbar_kws={"shrink": 0.8},  # カラーバーの設定
)
plt.title("指標間の相関係数", fontsize=16, pad=20)
plt.tight_layout()
plt.show()


# 2. フィーティア別の統計比較
def anomaly_rate(x):
    return (abs(x) > 3).mean()


fee_tier_stats = enhanced_df.groupby("fee_tier").agg(
    {
        "volume_usd": ["mean", "std", "max"],
        "tvl_usd": ["mean", "std"],
        "tx_count": "mean",
        "volume_deviation": ["mean", "std", anomaly_rate],
    }
)

# カラム名をフラット化
fee_tier_stats.columns = [
    "_".join(col).strip() if isinstance(col, tuple) else col for col in fee_tier_stats.columns.values
]
fee_tier_stats = fee_tier_stats.reset_index()

# フィーティア別の異常発生率比較
plt.figure(figsize=(10, 6))
bar = sns.barplot(data=fee_tier_stats, x="fee_tier", y="volume_deviation_anomaly_rate", palette="viridis")

# 値を棒グラフの上に表示
for i, p in enumerate(bar.patches):
    height = p.get_height()
    bar.text(p.get_x() + p.get_width() / 2.0, height + 0.01, f"{height:.2%}", ha="center", fontsize=10)

plt.title("フィーティア別の異常発生率(volume_deviation > 3σ)", fontsize=14)
plt.xlabel("フィーティア")
plt.ylabel("異常発生率")
plt.grid(axis="y", alpha=0.3)
plt.tight_layout()
plt.show()

# 3. トークンペアの組み合わせ分析
token_pairs = enhanced_df.groupby(["token0_symbol", "token1_symbol"]).size().reset_index(name="count")
top_pairs = token_pairs.sort_values("count", ascending=False).head(15)

# 表示用のラベルにトークンペアを結合
top_pairs["pair_label"] = top_pairs["token0_symbol"] + "-" + top_pairs["token1_symbol"]

plt.figure(figsize=(12, 10))
bars = sns.barplot(
    data=top_pairs,
    y="pair_label",  # 結合したラベルを使用
    x="count",
    palette="muted",
    orient="h",  # 水平バーチャート
)

# 値を棒グラフの中に表示
for i, p in enumerate(bars.patches):
    width = p.get_width()
    bars.text(
        width * 0.98,
        p.get_y() + p.get_height() / 2.0,
        f"{int(width)}",
        ha="right",
        va="center",
        color="white",
        fontweight="bold",
        fontsize=10,
    )

plt.title("最も一般的なトークンペア", fontsize=14)
plt.xlabel("件数")
plt.ylabel("")  # yラベルは不要
plt.tight_layout()
plt.show()

# ボーナス:フィーティア別の各種統計量のヒートマップ表示
# 主要な統計量のみを選択
key_stats = ["volume_usd_mean", "tvl_usd_mean", "tx_count_mean", "volume_deviation_anomaly_rate"]
stats_df = fee_tier_stats[["fee_tier"] + key_stats].set_index("fee_tier")

# フォーマットとラベル設定
formatted_labels = {
    "volume_usd_mean": "平均取引量",
    "tvl_usd_mean": "平均TVL",
    "tx_count_mean": "平均取引回数",
    "volume_deviation_anomaly_rate": "異常発生率",
}

# 正規化(各指標を0-1スケールに)
normalized_stats = stats_df.copy()
for col in key_stats:
    normalized_stats[col] = (stats_df[col] - stats_df[col].min()) / (stats_df[col].max() - stats_df[col].min())

plt.figure(figsize=(10, 8))
sns.heatmap(
    normalized_stats,
    annot=stats_df,  # 元の値を表示
    fmt=".2g",  # 一般形式で表示
    cmap="YlGnBu",
    linewidths=0.5,
    yticklabels=normalized_stats.index,
    xticklabels=[formatted_labels[col] for col in key_stats],
)
plt.title("フィーティア別の主要統計指標", fontsize=14)
plt.tight_layout()
plt.show()
No description has been provided for this image
/tmp/ipykernel_72670/4107957614.py:46: FutureWarning: 

Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `x` variable to `hue` and set `legend=False` for the same effect.

  bar = sns.barplot(data=fee_tier_stats, x="fee_tier", y="volume_deviation_anomaly_rate", palette="viridis")
No description has been provided for this image
/tmp/ipykernel_72670/4107957614.py:68: FutureWarning: 

Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `y` variable to `hue` and set `legend=False` for the same effect.

  bars = sns.barplot(
No description has been provided for this image
No description has been provided for this image
In [ ]:
 
In [ ]:
 
In [ ]:
 

ベースラインモデルの定義¶

In [22]:
# 異常検知のための特徴選択
anomaly_features = ["volume_usd", "tvl_usd", "tx_count", "volume_deviation", "volume_tvl_ratio", "price_change"]

# 欠損値処理
model_df = enhanced_df.dropna(subset=anomaly_features)

# インデックスをリセット(重要: インデックスの問題を解決するため)
model_df = model_df.reset_index(drop=True)

# スケーリング
scaler = StandardScaler()
scaled_features = scaler.fit_transform(model_df[anomaly_features])

# Isolation Forestで異常検知
clf = IsolationForest(contamination=0.01, random_state=42)
model_df["anomaly_score"] = clf.fit_predict(scaled_features)
model_df["is_anomaly"] = model_df["anomaly_score"] == -1

# 検出された異常のみのデータフレーム
anomaly_df = model_df[model_df["is_anomaly"]].copy()

# プール名をより短く、読みやすくする
anomaly_df["pool_short"] = anomaly_df["pool_address"].apply(lambda x: x[:8] + "...")

# サイズ変数の調整(視覚的に見やすくするため)
if len(anomaly_df) > 0:
    size_var = np.abs(anomaly_df["volume_deviation"])
    if size_var.max() > size_var.min():
        size_var = 50 + (size_var - size_var.min()) / (size_var.max() - size_var.min()) * 200
    else:
        size_var = 100  # 全て同じ値の場合のデフォルトサイズ
else:
    size_var = []  # 空の場合

# 1. 検出された異常の可視化
if len(anomaly_df) > 0:
    plt.figure(figsize=(14, 8))

    # 色分け用のパレット設定(プール数に応じて色を生成)
    unique_pools = anomaly_df["pool_short"].nunique()
    palette = sns.color_palette("husl", unique_pools)

    # 散布図プロット
    scatter = sns.scatterplot(
        data=anomaly_df,
        x="datetime",
        y="volume_usd",
        hue="pool_short",
        size=size_var,
        sizes=(50, 250),  # サイズの範囲
        palette=palette,
        alpha=0.7,
        edgecolor="black",
        linewidth=0.5,
    )

    # グラフの設定
    plt.title("検出された異常(Isolation Forest)", fontsize=16)
    plt.xlabel("日時")
    plt.ylabel("取引量 (USD)")

    # x軸の日付フォーマット調整
    plt.gcf().autofmt_xdate()

    # 凡例調整(多すぎる場合は表示を制限)
    if unique_pools > 10:
        # 凡例を上位10件に制限
        handles, labels = scatter.get_legend_handles_labels()
        plt.legend(handles[:11], labels[:11], title="Pool", fontsize=9, loc="best")
    else:
        plt.legend(title="Pool", fontsize=9, loc="best")

    # 注釈追加(トップ5の異常値)
    if len(anomaly_df) >= 5:
        top_anomalies = anomaly_df.nlargest(5, "volume_usd")
    else:
        top_anomalies = anomaly_df

    for idx, row in top_anomalies.iterrows():
        plt.annotate(
            f"{row['token0_symbol']}-{row['token1_symbol']}",
            xy=(row["datetime"], row["volume_usd"]),
            xytext=(10, 0),
            textcoords="offset points",
            fontsize=9,
            bbox=dict(boxstyle="round,pad=0.3", fc="yellow", alpha=0.5),
        )

    plt.tight_layout()
    plt.show()
else:
    print("異常は検出されませんでした。")

# 2. プール別の異常数集計
# as_index=Falseを指定して明示的にカラムとして処理
anomaly_by_pool = model_df.groupby("pool_address", as_index=False)["is_anomaly"].sum()
anomaly_by_pool = anomaly_by_pool[anomaly_by_pool["is_anomaly"] > 0].sort_values("is_anomaly", ascending=False)

if len(anomaly_by_pool) > 0:
    top10_pools = anomaly_by_pool.head(10).copy()

    # プールアドレスを短縮
    top10_pools["pool_short"] = top10_pools["pool_address"].apply(lambda x: x[:10] + "...")

    plt.figure(figsize=(12, 7))
    bars = sns.barplot(
        data=top10_pools,
        x="is_anomaly",
        y="pool_short",
        palette="viridis",
        orient="h",  # 水平棒グラフ
    )

    # バーに数値を表示
    for i, p in enumerate(bars.patches):
        width = p.get_width()
        plt.text(
            width + 0.3,  # 少し右にオフセット
            p.get_y() + p.get_height() / 2,
            f"{int(width)}",
            ha="left",
            va="center",
        )

    plt.title("異常が最も多く検出されたプールTop10", fontsize=14)
    plt.xlabel("異常データ数")
    plt.ylabel("プールアドレス")
    plt.tight_layout()
    plt.show()
else:
    print("異常は検出されませんでした。")

# 3. ボーナス:トークンペア別の異常発生数
if len(anomaly_df) > 0:
    # トークンペア情報を結合
    anomaly_df["token_pair"] = anomaly_df["token0_symbol"] + "-" + anomaly_df["token1_symbol"]

    # トークンペア別の異常数を集計
    anomaly_by_pair = anomaly_df.groupby("token_pair").size().reset_index(name="count")
    top_pairs = anomaly_by_pair.sort_values("count", ascending=False).head(10)

    plt.figure(figsize=(12, 7))
    bars = sns.barplot(data=top_pairs, x="count", y="token_pair", palette="muted", orient="h")

    # バーに数値を表示
    for i, p in enumerate(bars.patches):
        width = p.get_width()
        plt.text(width + 0.1, p.get_y() + p.get_height() / 2, f"{int(width)}", ha="left", va="center")

    plt.title("異常が最も多く検出されたトークンペアTop10", fontsize=14)
    plt.xlabel("異常データ数")
    plt.ylabel("トークンペア")
    plt.tight_layout()
    plt.show()
No description has been provided for this image
/tmp/ipykernel_72670/1810225964.py:106: FutureWarning: 

Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `y` variable to `hue` and set `legend=False` for the same effect.

  bars = sns.barplot(
No description has been provided for this image
/tmp/ipykernel_72670/1810225964.py:143: FutureWarning: 

Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `y` variable to `hue` and set `legend=False` for the same effect.

  bars = sns.barplot(data=top_pairs, x="count", y="token_pair", palette="muted", orient="h")
No description has been provided for this image
In [ ]:
 
In [ ]:
 
In [ ]: